https://www.tidytextmining.com/
library(rtweet)
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages ------------------------------------------------------------------- tidyverse 1.3.0 --
v ggplot2 3.3.2 v purrr 0.3.4
v tibble 3.0.4 v dplyr 1.0.2
v tidyr 1.1.2 v stringr 1.4.0
v readr 1.4.0 v forcats 0.5.0
-- Conflicts ---------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x purrr::flatten() masks rtweet::flatten()
x dplyr::lag() masks stats::lag()
library(tidytext)
library(wordcloud2)
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
search_tweets
# Request up to 1,000 English-language tweets matching "marchmadness".
# This talks to the Twitter API and asks for browser authentication when
# no token is available yet.
tweet_collection <- search_tweets(
  q = "marchmadness",
  n = 1000,
  lang = "en"
)
Requesting token on behalf of user...
Waiting for authentication in browser...
Press Esc/Ctrl + C to abort
Authentication complete.
Downloading [=======>---------------------------------] 20%
Downloading [===========>-----------------------------] 30%
Downloading [===============>-------------------------] 40%
Downloading [===================>---------------------] 50%
Downloading [========================>----------------] 60%
Downloading [============================>------------] 70%
Downloading [================================>--------] 80%
Downloading [====================================>----] 90%
Downloading [=========================================] 100%
# Keep only original tweets. `is_retweet` is a logical column, so negate it
# directly instead of comparing it with the string "FALSE" (which only works
# through implicit logical-to-character coercion).
tweet_collection <- tweet_collection %>%
  filter(!is_retweet)

# Print the remaining tweets.
tweet_collection
# Number each account's tweets 1, 2, ... in the order they appear in the data.
tweets_by_tweeter <- tweet_collection %>%
  group_by(screen_name) %>%
  mutate(line = seq_len(n())) %>%
  ungroup()

# Accounts ranked by how many tweets each contributed.
count(tweets_by_tweeter, screen_name, sort = TRUE)

# Column-by-column overview of the result.
tweets_by_tweeter %>%
  glimpse()
Rows: 321
Columns: 91
$ user_id <chr> "1321835395231920129", "85961526", "85961526", "85961526", "85961526", ...
$ status_id <chr> "1321838635323445248", "1321838549977751552", "1321113228781518848", "1...
$ created_at <dttm> 2020-10-29 15:37:43, 2020-10-29 15:37:23, 2020-10-27 15:35:13, 2020-10...
$ screen_name <chr> "koenig_reese", "TheAndyKatz", "TheAndyKatz", "TheAndyKatz", "TheAndyKa...
$ text <chr> "@marchmadness @TheAndyKatz @ZagMBB @NovaMBB @DukeMBB @TexasMBB @MSU_Ba...
$ source <chr> "Twitter for iPhone", "Twitter Web App", "Twitter for iPhone", "Twitter...
$ display_text_width <dbl> 18, 92, 181, 120, 101, 27, 247, 65, 6, 220, 15, 25, 28, 15, 19, 63, 4, ...
$ reply_to_status_id <chr> "1321834235561451520", NA, NA, NA, NA, "1321834235561451520", NA, "1321...
$ reply_to_user_id <chr> "202416362", NA, NA, NA, NA, "202416362", NA, "20179206", "202416362", ...
$ reply_to_screen_name <chr> "marchmadness", NA, NA, NA, NA, "marchmadness", NA, "peterp2000", "marc...
$ is_quote <lgl> FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE...
$ is_retweet <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F...
$ favorite_count <int> 0, 1, 11, 7, 0, 0, 2, 0, 0, 1, 0, 0, 7, 1, 0, 0, 0, 8, 21, 33, 1223, 12...
$ retweet_count <int> 0, 0, 6, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 7, 4, 223, 14, 0,...
$ quote_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ reply_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ hashtags <list> [NA, NA, NA, NA, NA, "WVU", NA, NA, NA, NA, NA, NA, <"ZagUp", "GoZags"...
$ symbols <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ urls_url <list> [NA, "twitter.com/marchmadness/s…", "twitter.com/marchmadness/s…", "fa...
$ urls_t.co <list> [NA, "https://t.co/N2Oh7TsEyY", "https://t.co/9cdID9pfPQ", "https://t....
$ urls_expanded_url <list> [NA, "https://twitter.com/marchmadness/status/1321834235561451520?s=20...
$ media_url <list> [NA, NA, NA, NA, NA, NA, NA, NA, "http://pbs.twimg.com/tweet_video_thu...
$ media_t.co <list> [NA, NA, NA, NA, NA, NA, NA, NA, "https://t.co/vmpqMz28ZE", NA, NA, NA...
$ media_expanded_url <list> [NA, NA, NA, NA, NA, NA, NA, NA, "https://twitter.com/Tonycastilla99/s...
$ media_type <list> [NA, NA, NA, NA, NA, NA, NA, NA, "photo", NA, NA, NA, NA, NA, NA, NA, ...
$ ext_media_url <list> [NA, NA, NA, NA, NA, NA, NA, NA, "http://pbs.twimg.com/tweet_video_thu...
$ ext_media_t.co <list> [NA, NA, NA, NA, NA, NA, NA, NA, "https://t.co/vmpqMz28ZE", NA, NA, NA...
$ ext_media_expanded_url <list> [NA, NA, NA, NA, NA, NA, NA, NA, "https://twitter.com/Tonycastilla99/s...
$ ext_media_type <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ mentions_user_id <list> [<"202416362", "85961526", "602989093", "1581277519", "18272699", "188...
$ mentions_screen_name <list> [<"marchmadness", "TheAndyKatz", "ZagMBB", "NovaMBB", "DukeMBB", "Texa...
$ lang <chr> "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en",...
$ quoted_status_id <chr> NA, "1321834235561451520", "1321100218943840261", NA, NA, NA, "13218342...
$ quoted_text <chr> NA, "Who will have a breakout season? \U0001f440\U0001f914\n\n@TheAndyK...
$ quoted_created_at <dttm> NA, 2020-10-29 15:20:14, 2020-10-27 14:43:31, NA, NA, NA, 2020-10-29 1...
$ quoted_source <chr> NA, "Twitter Web App", "Twitter Web App", NA, NA, NA, "Twitter Web App"...
$ quoted_favorite_count <int> NA, 123, 21, NA, NA, NA, 123, NA, NA, NA, 123, 123, 123, 123, 123, NA, ...
$ quoted_retweet_count <int> NA, 14, 7, NA, NA, NA, 14, NA, NA, NA, 14, 14, 14, 14, 14, NA, NA, 14, ...
$ quoted_user_id <chr> NA, "202416362", "202416362", NA, NA, NA, "202416362", NA, NA, NA, "202...
$ quoted_screen_name <chr> NA, "marchmadness", "marchmadness", NA, NA, NA, "marchmadness", NA, NA,...
$ quoted_name <chr> NA, "NCAA March Madness", "NCAA March Madness", NA, NA, NA, "NCAA March...
$ quoted_followers_count <int> NA, 1419268, 1419268, NA, NA, NA, 1419268, NA, NA, NA, 1419268, 1419268...
$ quoted_friends_count <int> NA, 815, 815, NA, NA, NA, 815, NA, NA, NA, 815, 815, 815, 815, 815, NA,...
$ quoted_statuses_count <int> NA, 29873, 29873, NA, NA, NA, 29873, NA, NA, NA, 29873, 29873, 29873, 2...
$ quoted_location <chr> NA, "", "", NA, NA, NA, "", NA, NA, NA, "", "", "", "", "", NA, NA, "",...
$ quoted_description <chr> NA, "The official NCAA March Madness destination for all things Divisio...
$ quoted_verified <lgl> NA, TRUE, TRUE, NA, NA, NA, TRUE, NA, NA, NA, TRUE, TRUE, TRUE, TRUE, T...
$ retweet_status_id <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_text <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_created_at <dttm> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ retweet_source <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_favorite_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_retweet_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_user_id <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_screen_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_followers_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_friends_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_statuses_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_location <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_description <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_verified <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ place_url <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "https://api.twitter.com/1....
$ place_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Texas", "Tucson", NA, NA, ...
$ place_full_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Texas, USA", "Tucson, AZ",...
$ place_type <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "admin", "city", NA, NA, NA...
$ country <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "United States", "United St...
$ country_code <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "US", "US", NA, NA, NA, NA,...
$ geo_coords <list> [<NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>,...
$ coords_coords <list> [<NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>,...
$ bbox_coords <list> [<NA, NA, NA, NA, NA, NA, NA, NA>, <NA, NA, NA, NA, NA, NA, NA, NA>, <...
$ status_url <chr> "https://twitter.com/koenig_reese/status/1321838635323445248", "https:/...
$ name <chr> "Reese Koenig", "Andy Katz", "Andy Katz", "Andy Katz", "Andy Katz", "Ca...
$ location <chr> "", "", "", "", "", "", "Spokane, WA", "", "United States", "", "Grand ...
$ description <chr> "", "Digital reporter, analyst, host for @MarchMadness, March Madness 3...
$ url <chr> NA, NA, NA, NA, NA, NA, NA, NA, "https://t.co/L6kiKVvYn5", NA, NA, "htt...
$ protected <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F...
$ followers_count <int> 1, 471484, 471484, 471484, 471484, 214, 3673, 17, 465, 61, 1259, 362, 2...
$ friends_count <int> 135, 1387, 1387, 1387, 1387, 1096, 1194, 109, 1231, 1277, 1092, 574, 12...
$ listed_count <int> 0, 6808, 6808, 6808, 6808, 2, 28, 0, 3, 0, 3, 1, 7, 0, 2, 1, 3, 244, 38...
$ statuses_count <int> 2, 40140, 40140, 40140, 40140, 3651, 19731, 1086, 46054, 7047, 72039, 2...
$ favourites_count <int> 6, 53, 53, 53, 53, 21105, 24639, 100, 69223, 1219, 30820, 9363, 36398, ...
$ account_created_at <dttm> 2020-10-29 15:25:06, 2009-10-29 01:13:45, 2009-10-29 01:13:45, 2009-10...
$ verified <lgl> FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE...
$ profile_url <chr> NA, NA, NA, NA, NA, NA, NA, NA, "https://t.co/L6kiKVvYn5", NA, NA, "htt...
$ profile_expanded_url <chr> NA, NA, NA, NA, NA, NA, NA, NA, "http://tonycastilla.com", NA, NA, "htt...
$ account_lang <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ profile_banner_url <chr> NA, "https://pbs.twimg.com/profile_banners/85961526/1535831124", "https...
$ profile_background_url <chr> NA, "http://abs.twimg.com/images/themes/theme1/bg.png", "http://abs.twi...
$ profile_image_url <chr> "http://pbs.twimg.com/profile_images/1321835514186604554/qwrBaNDJ_norma...
$ line <int> 1, 1, 2, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 1, 1,...
"Because we have kept text such as hashtags and usernames in the dataset, we can’t use a simple anti_join() to remove stop words. Instead, we can take the approach shown in the filter() line that uses str_detect() from the stringr package." – https://www.tidytextmining.com/twitter.html
# Peek at the table that is anti-joined below to remove stop words.
stopwordslangs %>%
  head()

# Overall word counts, with @-mentions and stop words removed.
# NOTE(review): `tweets_tokenized` is not created anywhere in the visible
# code -- confirm it is defined before this point.
tweets_tokenized %>%
  count(word, sort = TRUE, name = "freq") %>%
  filter(str_detect(word, "^\\@", negate = TRUE)) %>%
  anti_join(stopwordslangs) # alternative: anti_join(tidytext::get_stopwords())
Joining, by = "word"
# Relative frequency of every word within each account's tweets:
# freq = (times the account used the word) / (all tokens from that account).
frequency <- tweets_tokenized %>%
  group_by(screen_name) %>%
  count(word, sort = TRUE) %>%
  # Attach each account's total token count (natural join on `screen_name`).
  left_join(tweets_tokenized %>%
    group_by(screen_name) %>%
    summarise(total = n())) %>%
  mutate(freq = n / total) %>%
  # count() on grouped data keeps the screen_name grouping; drop it so later
  # steps work on a plain, ungrouped tibble.
  ungroup()
`summarise()` ungrouping output (override with `.groups` argument)
Joining, by = "screen_name"
# Print the per-account word frequencies computed above.
frequency
"This is a nice and tidy data frame but we would actually like to plot those frequencies on the x- and y-axes of a plot, so we will need to use spread() from tidyr to make a differently shaped data frame." – https://www.tidytextmining.com/twitter.html
pivot_wider
# Reshape to one row per word with one frequency column per account.
# `values_fill` is commented out, so a word an account never used stays NA.
frequency <- pivot_wider(
  select(frequency, screen_name, word, freq),
  names_from = screen_name,
  values_from = freq
) # , values_fill = 0)

frequency
# Word cloud built from overall word counts, without @-mentions or stop words.
# (A per-account variant via group_by(screen_name) was tried and disabled.)
wordcloud2(
  tweets_tokenized %>%
    # group_by(screen_name) %>%
    count(word, sort = TRUE, name = "freq") %>%
    filter(str_detect(word, "^\\@", negate = TRUE)) %>%
    anti_join(stopwordslangs)
)
Joining, by = "word"
# Bar chart of the 30 most frequent tokens; @-mentions are dropped but stop
# words are still included here.
tweets_tokenized %>%
  count(word, sort = TRUE, name = "freq") %>%
  filter(str_detect(word, "^\\@", negate = TRUE)) %>%
  slice_head(n = 30) %>%
  ggplot(aes(x = freq, y = fct_reorder(word, freq))) +
  geom_col()
# Same top-30 bar chart, this time with stop words removed before the
# @-mention filter.
tweets_tokenized %>%
  count(word, sort = TRUE, name = "freq") %>%
  anti_join(stopwordslangs) %>%
  filter(str_detect(word, "^\\@", negate = TRUE)) %>%
  slice_head(n = 30) %>%
  ggplot(aes(x = freq, y = fct_reorder(word, freq))) +
  geom_col()
Joining, by = "word"
# Word-frequency comparison of two accounts on log-scaled percent axes.
# Words close to the firebrick y = x line have similar frequency in both.
frequency %>%
  ggplot(aes(x = LouInPain, y = Dukeballnation)) +
  scale_x_log10(labels = scales::percent_format()) +
  scale_y_log10(labels = scales::percent_format()) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.25, height = 0.25) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  geom_abline(color = "firebrick")
# fs::dir_create("images")
# ggsave("images/dukeball.png")
# "CBBCent1" | screen_name == "Adam_Bradford1
# marchmadness TheAndyKatz
# Same frequency comparison for the marchmadness and TheAndyKatz accounts.
frequency %>%
  ggplot(aes(x = marchmadness, y = TheAndyKatz)) +
  scale_x_log10(labels = scales::percent_format()) +
  scale_y_log10(labels = scales::percent_format()) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.25, height = 0.25) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  geom_abline(color = "firebrick")
# Earliest and latest creation time among the collected tweets.
summarise(
  tweets_by_tweeter,
  min_date = min(created_at),
  max_date = max(created_at)
)
# Log ratio of word usage between two accounts.
# Positive values: relatively more typical of LouInPain;
# negative values: relatively more typical of Dukeballnation.
word_ratios <- tweets_tokenized %>%
  # filter(screen_name == "CBBCent1" | screen_name == "Adam_Bradford14") %>%
  filter(screen_name %in% c("LouInPain", "Dukeballnation")) %>%
  filter(!str_detect(word, "^@")) %>%
  count(word, screen_name) %>%
  # Keep only words used at least twice across the two accounts combined.
  group_by(word) %>%
  filter(sum(n) >= 2) %>%
  ungroup() %>%
  pivot_wider(names_from = screen_name, values_from = n, values_fill = 0) %>%
  # Turn each account's counts into smoothed shares; the +1 keeps words an
  # account never used away from zero so the log below stays finite.
  # across() replaces the superseded mutate_if() and still overwrites the
  # count columns in place.
  mutate(across(where(is.numeric), ~ (.x + 1) / (sum(.x) + 1))) %>%
  mutate(logratio = log(LouInPain / Dukeballnation)) %>%
  arrange(desc(logratio))

word_ratios
# Words with the log ratio closest to zero first, i.e. those used at the most
# similar rate by the two accounts.
arrange(word_ratios, abs(logratio))
# Plot the 15 most distinctive words on each side of zero (ties included).
# The fill is `logratio < 0`: FALSE (first legend entry) marks words that lean
# towards LouInPain, TRUE marks words that lean towards Dukeballnation.
word_ratios %>%
  group_by(logratio < 0) %>%
  # slice_max() replaces the superseded top_n(); both keep ties by default.
  slice_max(abs(logratio), n = 15) %>%
  ungroup() %>%
  mutate(word = reorder(word, logratio)) %>%
  ggplot(aes(word, logratio, fill = logratio < 0)) +
  geom_col() + # show.legend = FALSE) +
  coord_flip() +
  # The label now names the accounts that are actually being compared.
  ylab("log odds ratio (LouInPain/Dukeballnation)") +
  scale_fill_discrete(name = "", labels = c("LouInPain", "Dukeballnation"))
https://www.tidytextmining.com/twitter.html#favorites-and-retweets
https://www.tidytextmining.com/twitter.html#changes-in-word-use
# Frequency table derived from a term-document matrix.
# NOTE(review): neither `corpus` nor `TermDocumentMatrix()` is defined or
# loaded in the visible code (the latter presumably comes from the tm
# package) -- confirm before running this chunk.
# dtm <- DocumentTermMatrix(docs)
dtm2 <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm2)

# Row totals of the matrix, largest first.
v <- sort(rowSums(m), decreasing = TRUE)

# Keep rows 2 through 200, which skips the row with the largest total.
d <- data.frame(word = names(v), freq = v) %>%
  slice(2:200)
https://www.tidytextmining.com/tfidf.html#the-bind_tf_idf-function
# Word counts per account for the tf-idf analysis: tokenise the tweet text,
# then drop @-mentions, tokens starting with "http", and stop words.
tweet_words <- tweets_by_tweeter %>%
  select(screen_name, text, status_id, user_id) %>%
  unnest_tokens(output = word, input = text, token = "tweets") %>%
  filter(
    str_detect(word, "^\\@", negate = TRUE),
    str_detect(word, "^http", negate = TRUE)
  ) %>%
  anti_join(stopwordslangs) %>%
  count(word, tweeter = screen_name, sort = TRUE)
Using `to_lower = TRUE` with `token = 'tweets'` may not preserve URLs.
Joining, by = "word"
tweet_words

# Total number of counted words per account, largest first.
total_words <- tweet_words %>%
  group_by(tweeter) %>%
  summarise(total = sum(n)) %>%
  arrange(desc(total))
`summarise()` ungrouping output (override with `.groups` argument)
total_words

# Attach each account's total to its word counts (natural join on the shared
# `tweeter` column).
tweet_words <- tweet_words %>%
  left_join(total_words)
Joining, by = "tweeter"
tweet_words

# Add tf, idf and tf_idf columns, treating each account as one document.
bind_tf_idf(tweet_words, term = word, document = tweeter, n = n)
# Horizontal bar charts of tf-idf, one panel per account, restricted to words
# an account used more than twice.
tweet_words %>%
  bind_tf_idf(term = word, document = tweeter, n = n) %>%
  arrange(desc(tf_idf)) %>%
  # Factor levels follow the reverse of the tf-idf ordering established above.
  mutate(word = fct_rev(fct_inorder(word))) %>%
  filter(n > 2) %>%
  # group_by(tweeter) %>%
  # top_n(2) %>%
  # ungroup() %>%
  ggplot(aes(x = word, y = tf_idf)) +
  geom_col() +
  facet_wrap(vars(tweeter)) +
  coord_flip()
http://antonio-ferraro.eu.pn/word-clouds-in-r-packages-wordcloud2-and-tm/
https://jrnold.github.io/qss-tidy/discovery.html#textual-data
https://rstudio-pubs-static.s3.amazonaws.com/31867_8236987cf0a8444e962ccd2aec46d9c3.html
of less use